# Tables 6,7 and Figures S1-S6 in "Evaluating (weighted) dynamic treatment effects 
# by double machine learning" by Hugo Bodory, Martin Huber, and Luk� Laff�rs 

# To replicate the results for effect estimations and propensity score densities,
# the variables trim_threshold, d1d2_treated, and d1d2_controls (lines 118-120) 
# have to be adapted accordingly. Line 110 reads the dataset data.csv. 

rm(list = ls())

packages = c('SuperLearner','e1071','glmnet','ranger','xgboost','ggplot2','dplyr','parallel')
lapply(packages,FUN=function(packages) {do.call("require", list(packages))})

# SuperLearner algorithms
MLfunct=function(y, x, d1=NULL, d2=NULL, MLmethod="lasso",  ybin=0){
  if (is.null(d1)==0 & is.null(d2)==0) { y=y[d1==1 & d2==1]; x=x[d1==1 & d2==1,]}
  if (is.null(d1)==0 & is.null(d2)==1) { y=y[d1==1]; x=x[d1==1,]}
  cl <- makeCluster(16, type = "PSOCK")  # can use different types here
  clusterSetRNGStream(cl, iseed = 2343)
  # make SL functions available on the clusters
  foo <- clusterEvalQ(cl, library(SuperLearner))  
  if  (MLmethod=="randomforest"){ 
    if (ybin==1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=binomial(),SL.library = "SL.ranger",verbose=TRUE) 
    if (ybin!=1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=gaussian(),SL.library = "SL.ranger",verbose=TRUE) 
  }
  if  (MLmethod=="lasso"){ 
    if (ybin==1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=binomial(),SL.library = "SL.glmnet",verbose=TRUE) 
    if (ybin!=1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=gaussian(),SL.library = "SL.glmnet",verbose=TRUE) 
  }
  stopCluster(cl)  
  model        
}

# dynamic treatment effect estimation
dyntreatDML=function(y2,d1,d2,x0,x1, s=NULL, d1treat=1, d2treat=1, d1control=0, d2control=0,  trim=0.01, MLmethod="lasso", fewsplits=FALSE){
  if (length(d1treat)==1) {d1tre=1*(d1==d1treat)} else {d1tre=d1treat}
  if (length(d2treat)==1) {d2tre=1*(d2==d2treat)} else {d2tre=d2treat}
  if (length(d1control)==1) {d1con=1*(d1==d1control)} else {d1con=d1control}
  if (length(d2control)==1) {d2con=1*(d2==d2control)} else {d2con=d2control}
  scorestreat=hddyntreat(y2=y2,d1=d1tre,d2=d2tre,x0=x0,x1=x1, s=s, trim=trim, MLmethod=MLmethod, fewsplits=fewsplits)
  scorescontrol=hddyntreat(y2=y2,d1=d1con,d2=d2con,x0=x0,x1=x1, s=s, trim=trim, MLmethod=MLmethod, fewsplits=fewsplits) 
  trimmed=1*(scorescontrol[,10]+scorestreat[,10]>0)  # number of trimmed observations
  scorestreat=scorestreat[trimmed==0,]
  scorescontrol=scorescontrol[trimmed==0,]
  tscores=(scorestreat[,1]*scorestreat[,2]*scorestreat[,3]*(scorestreat[,4]-scorestreat[,5])/(scorestreat[,6]*scorestreat[,7])+scorestreat[,1]*scorestreat[,2]*(scorestreat[,5]-scorestreat[,8])/scorestreat[,6]+scorestreat[,9]*scorestreat[,8])/mean(scorestreat[,9])
  cscores=(scorescontrol[,1]*scorescontrol[,2]*scorescontrol[,3]*(scorescontrol[,4]-scorescontrol[,5])/(scorescontrol[,6]*scorescontrol[,7])+scorescontrol[,1]*scorescontrol[,2]*(scorescontrol[,5]-scorescontrol[,8])/scorescontrol[,6]+scorescontrol[,9]*scorescontrol[,8])/mean(scorescontrol[,9])
  meantreat=mean(tscores)
  meancontrol=mean(cscores)
  effect=meantreat - meancontrol
  se=sqrt(mean((tscores-cscores-effect)^2)/length(tscores))
  pval= 2*pnorm((-1)*abs(effect/se))
  idx_treat_without_trimmed = d1tre[trimmed==0]&d2tre[trimmed==0]
  idx_control_without_trimmed = d1con[trimmed==0]&d2con[trimmed==0]
  list(effect=effect,se=se,pval=pval,ntrimmed=sum(trimmed),meantreat=meantreat,meancontrol=meancontrol,
       psd1treat=scorestreat[,6],psd2treat=scorestreat[,7],psd1control=scorescontrol[,6],psd2control=scorescontrol[,7],
       idx_treat_without_trimmed=idx_treat_without_trimmed,idx_control_without_trimmed=idx_control_without_trimmed)
}

# scores for dynamic treatment effect estimation
hddyntreat=function(y2, d1, d2, x0 , x1, s=NULL, trim=0.05, MLmethod="lasso", fewsplits=fewsplits){
  ybin=1*(length(unique(y2))==2 & min(y2)==0 & max(y2)==1)  # check if binary outcome
  x0=data.frame(x0); x0x1=data.frame(x0,x1); d1x0x1=data.frame(d1,x0x1);
  # cross-fitting procedure that splits sample in training an testing data
  stepsize=ceiling((1/3)*length(y2))
  nobs = min(3*stepsize,length(y2)); set.seed(1); idx = sample(nobs);
  sample1 = idx[1:stepsize]; sample2 = idx[(stepsize+1):(2*stepsize)]; sample3 = idx[(2*stepsize+1):nobs];
  score=c(); sel=c(); trimmed=c()
  for (i in 1:3){
    if (i==3) {trsample1=sample1; trsample2=sample2; tesample=sample3}
    if (i==1) {trsample1=sample2; trsample2=sample3; tesample=sample1}
    if (i==2) {trsample1=sample3; trsample2=sample1; tesample=sample2}
    # total training sample
    trsample=c(trsample1,trsample2)
    # in case that fewsplits is one, both training data are merged
    if (fewsplits==1){trsample1=c(trsample1,trsample2);trsample2=trsample1}
    if (is.null(s)) {gte=rep(1,length(tesample)); ste=gte} #check if weighted estimation should be performed
    if (is.null(s)==0) {
      g=MLfunct(y=s[trsample], x=x0[trsample,], MLmethod=MLmethod,  ybin=1)
      gte=predict(g, x0[tesample,], onlySL = TRUE)$pred     #predict weighting function in test data
      ste=s[tesample]
    }
    # nuisance parameters
    p1=MLfunct(y=d1[trsample], x=x0[trsample,], MLmethod=MLmethod,  ybin=1)
    p1te=predict(p1, x0[tesample,], onlySL = TRUE)$pred  # predict ps1 in test data
    p2=MLfunct(y=d2[trsample], x=d1x0x1[trsample,], MLmethod=MLmethod, ybin=1)
    p2te=predict(p2, d1x0x1[tesample,], onlySL = TRUE)$pred  # predict ps2 in test data
    y2d1d2=MLfunct(y=y2[trsample1], x=x0x1[trsample1,], d1=d1[trsample1], d2=d2[trsample1], MLmethod=MLmethod, ybin=ybin)
    y2d1d2te=predict(y2d1d2,  x0x1[tesample,], onlySL = TRUE)$pred  # predict E[Y2|D1,D2,X0,X1] in test data
    y2d1d2tr2=predict(y2d1d2, x0x1[trsample2,], onlySL = TRUE)$pred  # predict E[Y2|D1,D2,X0,X1] in second training data
    y1d1=MLfunct(y=y2d1d2tr2, x=x0[trsample2,], d1=d1[trsample2], MLmethod=MLmethod, ybin=0)
    y1d1te=predict(y1d1, x0[tesample,], onlySL = TRUE)$pred  # predict E[E[Y2|D1,D2,X0,X1]|D1,X0] in test data
    trimmed=1*((p1te*p2te)<trim)  # observations not satisfying trimming restriction
    score=rbind(score, cbind(gte,d1[tesample],d2[tesample],y2[tesample],y2d1d2te,p1te,p2te,y1d1te,ste,trimmed))
  }
  score = score[order(idx),]
  score
}

# kernel density plots
plot_density <- function(p1,p2,title="Propensity scores: Treated vs. Controls"){
  density_p1=density(p1)
  density_p2=density(p2)
  max_ylim=max(density_p1$y,density_p2$y)
  plot(density_p1, xlim=c(-0.01,1.01), ylim=c(0, max_ylim+0.5),
       col="blue", main=title, xlab="Propensity score")
  lines(density_p2, col="red")
  legend("topright",legend=c("treated","controls"),col=c("blue", "red"),lty=1,bg="transparent")
}

# data
df = read.csv(file='data.csv', header=TRUE)
r = df[,1] # random assignment indicator, not used for tables 6+7
d = df[,2:3] # d1,d2: treatments in 2 periods
y = df[,4] # employment: outcome
x0 = df[,5:913] # x0: covariates
x1 = df[,914:ncol(df)] # x1: covariates

# parameters 
trim_threshold = 0.01  # trimming threshold
d1d2_treated = c(3,3)  # d1,d2 for treated
d1d2_controls = c(1,1)  # d1,d2 for controls

# data frame specifying treatment values
dmat = data.frame(matrix(c(d1d2_treated, d1d2_controls), nrow=1, ncol=4))
colnames(dmat) = c('d1tr','d2tr','d1co','d2co')

# data frame for effect estimation shown in tables 6+7
table_6_or_7 = data.frame(matrix(0, nrow=1, ncol=8))
colnames(table_6_or_7) = c('tr','co','y0','eff','se','p','n','trim')

# index for subset of data as stated in the notes of tables 6+7 (S=1)
# drop observations with missing outcome and treatment values
idx = (y!=-1) & (d[,1]==dmat[1,'d1tr']|d[,1]==dmat[1,'d1co']) & (d[,2]>=0)

# subset of data for effect estimation
y = y[idx]
d1 = d[idx,1]
d2 = d[idx,2]
x0 = x0[idx,]
x1 = x1[idx,] 

# effect estimation
results = dyntreatDML(y,d1,d2,x0,x1,s=NULL,d1treat=dmat[1,1],d2treat=dmat[1,2],
                      d1control=1*(d1==dmat[1,3]),d2control=1*(d2==dmat[1,4]),
                      trim=trim_threshold,MLmethod="randomforest",fewsplits=FALSE)

# results presented in tables 6+7
table_6_or_7[1,1:8] = c(dmat[1,1]*10 + dmat[1,2], dmat[1,3]*10 + dmat[1,4],
                        round(results$meancontrol, 2), round(results$effect, 2), 
                        round(results$se, 2), round(results$pval, 2), length(y),
                        results$ntrimmed)
print(table_6_or_7)

# propensity scores presented in figures S1-S6 in the appendix
ps = list(results$psd1treat,results$psd2treat,results$psd1control,results$psd2control)
idx_t = results$idx_treat_without_trimmed; idx_c = results$idx_control_without_trimmed
titles = c(
  paste("Propensity score for treatment=",dmat[1]," in period 1",sep=""),
  paste("Propensity score for treatment=",dmat[2]," in period 2\ngiven treatment=",dmat[1]," in period 1",sep=""),
  paste("Propensity score for treatment=",dmat[3]," in period 1",sep=""),
  paste("Propensity score for treatment=",dmat[4]," in period 2\ngiven treatment=",dmat[3]," in period 1",sep="")
  )
for (k in 1:4){
  plot_density(ps[[k]][idx_t],ps[[k]][idx_c],title=titles[k])
}
